{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f2c16396",
   "metadata": {},
   "source": [
    "### Screen Recording Interaction with Qwen2.5-Omni\n",
    "\n",
    "This notebook demonstrates how to use Qwen2.5-Omni to get the information and content you want to know by asking questions in real time on the recording screen."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "638e9082-c1ef-4efd-9a10-e35507e25363",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-01-29T12:40:04.049566Z",
     "iopub.status.busy": "2025-01-29T12:40:04.049365Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install git+https://github.com/huggingface/transformers\n",
    "!pip install qwen-omni-utils\n",
    "!pip install openai\n",
    "!pip install flash-attn --no-build-isolation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9596c50d-80a8-433f-b846-1fbf61145ccc",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2025-01-29T12:40:16.511701Z",
     "iopub.status.busy": "2025-01-29T12:40:16.510916Z",
     "iopub.status.idle": "2025-01-29T12:40:16.878038Z",
     "shell.execute_reply": "2025-01-29T12:40:16.877543Z",
     "shell.execute_reply.started": "2025-01-29T12:40:16.511678Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/omni/lib/python3.10/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from qwen_omni_utils import process_mm_info\n",
    "\n",
    "# @title inference function\n",
    "def inference(video_path, prompt, sys_prompt):\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": sys_prompt},\n",
    "        {\"role\": \"user\", \"content\": [\n",
    "                {\"type\": \"text\", \"text\": prompt},\n",
    "                {\"type\": \"video\", \"video\": video_path},\n",
    "            ]\n",
    "        },\n",
    "    ]\n",
    "    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "    # image_inputs, video_inputs = process_vision_info([messages])\n",
    "    audios, images, videos = process_mm_info(messages, use_audio_in_video=False)\n",
    "    inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True)\n",
    "    inputs = inputs.to(model.device).to(model.dtype)\n",
    "\n",
    "    output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)\n",
    "\n",
    "    text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)\n",
    "    return text"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "386e4cd8",
   "metadata": {},
   "source": [
    "Load model and processors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e829b782-0be7-4bc6-a576-6b815323376e",
   "metadata": {
    "ExecutionIndicator": {
     "show": false
    },
    "execution": {
     "iopub.execute_input": "2025-01-29T12:40:18.337731Z",
     "iopub.status.busy": "2025-01-29T12:40:18.337470Z",
     "iopub.status.idle": "2025-01-29T12:40:47.760976Z",
     "shell.execute_reply": "2025-01-29T12:40:47.760220Z",
     "shell.execute_reply.started": "2025-01-29T12:40:18.337713Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/omni/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2025-03-22 17:14:12.353632: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2025-03-22 17:14:12.386228: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "2025-03-22 17:14:12.386249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "2025-03-22 17:14:12.387082: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2025-03-22 17:14:12.392644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2025-03-22 17:14:13.131254: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
      "Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00,  1.14s/it]\n",
      "/opt/conda/envs/omni/lib/python3.10/site-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:6129: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
      "  for key, value in torch.load(path).items():\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor\n",
    "\n",
    "model_path = \"Qwen/Qwen2.5-Omni-7B\"\n",
    "model = Qwen2_5OmniModel.from_pretrained(\n",
    "    model_path,\n",
    "    torch_dtype=torch.bfloat16,\n",
    "    device_map=\"auto\",\n",
    "    attn_implementation=\"flash_attention_2\",\n",
    ")\n",
    "processor = Qwen2_5OmniProcessor.from_pretrained(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed93fb82",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Video"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a47ad45",
   "metadata": {},
   "source": [
    "#### Understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1935af5e",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2025-01-29T12:41:18.150397Z",
     "iopub.status.busy": "2025-01-29T12:41:18.149631Z",
     "iopub.status.idle": "2025-01-29T12:41:19.978329Z",
     "shell.execute_reply": "2025-01-29T12:41:19.977054Z",
     "shell.execute_reply.started": "2025-01-29T12:41:18.150371Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls  width=\"640\"  height=\"360\">\n",
       "      Your browser does not support the <code>video</code> element.\n",
       "    </video>"
      ],
      "text/plain": [
       "<IPython.core.display.Video object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n",
      "qwen-vl-utils using torchvision to read video.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "system\n",
      "You are a helpful assistant.\n",
      "user\n",
      "What the browser is used in this video?\n",
      "assistant\n",
      "The browser used in the video is Google Chrome.\n"
     ]
    }
   ],
   "source": [
    "video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
    "prompt = \"What the browser is used in this video?\"\n",
    "\n",
    "display(Video(video_path, width=640, height=360))\n",
    "\n",
    "## Use a local HuggingFace model to inference.\n",
    "response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
    "print(response[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9961aae",
   "metadata": {},
   "source": [
    "#### OCR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0894f5f1",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2025-01-29T12:44:01.387553Z",
     "iopub.status.busy": "2025-01-29T12:44:01.386725Z",
     "iopub.status.idle": "2025-01-29T12:44:09.671782Z",
     "shell.execute_reply": "2025-01-29T12:44:09.671200Z",
     "shell.execute_reply.started": "2025-01-29T12:44:01.387530Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls  width=\"640\"  height=\"360\">\n",
       "      Your browser does not support the <code>video</code> element.\n",
       "    </video>"
      ],
      "text/plain": [
       "<IPython.core.display.Video object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "system\n",
      "You are a helpful assistant.\n",
      "user\n",
      "Who is the authors of this paper?\n",
      "assistant\n",
      "The authors of the paper \"Attention Is All You Need\" are:\n",
      "\n",
      "1. Ashish Vaswani\n",
      "2. Noam Shazeer\n",
      "3. Niki Parmar\n",
      "4. Jakob Uszkoreit\n",
      "5. Llion Jones\n",
      "6. Aidan N. Gomez\n",
      "7. Lukasz Kaiser\n",
      "8. Illia Polosukhin\n"
     ]
    }
   ],
   "source": [
    "video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
    "prompt = \"Who is the authors of this paper?\"\n",
    "\n",
    "display(Video(video_path, width=640, height=360))\n",
    "\n",
    "## Use a local HuggingFace model to inference.\n",
    "response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
    "print(response[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e9b7651",
   "metadata": {},
   "source": [
    "#### Summarize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16aa3dc5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls  width=\"640\"  height=\"360\">\n",
       "      Your browser does not support the <code>video</code> element.\n",
       "    </video>"
      ],
      "text/plain": [
       "<IPython.core.display.Video object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "system\n",
      "You are a helpful assistant.\n",
      "user\n",
      "Summarize this paper in short.\n",
      "assistant\n",
      "The paper \"Attention Is All You Need\" introduces the Transformer model, a novel architecture for sequence-to-sequence tasks that relies entirely on self-attention mechanisms. The Transformer outperforms existing models in machine translation tasks, achieving state-of-the-art BLEU scores on WMT 2014 English-German and English-French translation tasks. The model is highly parallelizable, allowing for efficient training on large datasets and GPUs. The paper also discusses the application of the Transformer to other tasks, such as English constituency parsing, and highlights its potential for handling large inputs and outputs, such as images and audio.\n"
     ]
    }
   ],
   "source": [
    "video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
    "prompt = \"Summarize this paper in short.\"\n",
    "\n",
    "display(Video(video_path, width=640, height=360))\n",
    "\n",
    "## Use a local HuggingFace model to inference.\n",
    "response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
    "print(response[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8dd58bbd",
   "metadata": {},
   "source": [
    "#### Assistant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cea7d11",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls  width=\"640\"  height=\"360\">\n",
       "      Your browser does not support the <code>video</code> element.\n",
       "    </video>"
      ],
      "text/plain": [
       "<IPython.core.display.Video object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "system\n",
      "You are a helpful assistant.\n",
      "user\n",
      "Please trranslate the abstract of paper into Chinese.\n",
      "assistant\n",
      "The abstract of the paper \"Attention Is All You Need\" by Vaswani et al. (2017) is as follows:\n",
      "\n",
      "---\n",
      "\n",
      "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based entirely on self-attention. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.\n",
      "\n",
      "---\n",
      "\n",
      "Translation:\n",
      "\n",
      "---\n",
      "\n",
      "主流的序列转换模型基于复杂的递归或卷积神经网络，包括编码器和解码器。表现最好的模型还通过注意力机制连接编码器和解码器。我们提出了一种新的简单网络架构，Transformer，完全基于自我注意力。在两个机器翻译任务上的实验表明，这些模型在质量上更优，同时更易于并行化，并且训练时间显著减少。我们的模型在WMT 2014英语到德语翻译任务上实现了28.4 BLEU，超过现有最佳结果，包括组合模型，提高了2 BLEU。在WMT 2014英语到法语翻译任务上，我们的模型在仅训练3.5天、使用八个GPU的情况下，建立了新的单模型状态最佳BLEU得分41.8，比文献中最好的模型（包括组合模型）低2 BLEU。我们展示了Transformer在其他任务上的泛化能力，成功地将其应用于英语句法解析，即使在大量和有限的训练数据下也是如此。\n"
     ]
    }
   ],
   "source": [
    "video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
    "prompt = \"Please trranslate the abstract of paper into Chinese.\"\n",
    "\n",
    "display(Video(video_path, width=640, height=360))\n",
    "\n",
    "## Use a local HuggingFace model to inference.\n",
    "response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
    "print(response[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "omni",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
